{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:28.412962Z",
     "start_time": "2020-12-09T02:06:28.394133Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'3.6.9'"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from platform import python_version\n",
    "\n",
    "python_version()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:29.362990Z",
     "start_time": "2020-12-09T02:06:29.360699Z"
    }
   },
   "outputs": [],
   "source": [
    "import re"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Replace the string \"foo\" when it's got non-words or line boundaries to the left and to the right"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:29.726688Z",
     "start_time": "2020-12-09T02:06:29.720088Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "' FOO bar FOO foofoo barfoobar FOO '"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.sub(r'(?:\\W|^)foo(?:\\W|$)',' FOO ','foo bar foo foofoo barfoobar foo')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## String contains regex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:30.106322Z",
     "start_time": "2020-12-09T02:06:30.099422Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<_sre.SRE_Match object; span=(4, 7), match='123'>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# contains\n",
    "re.search('\\d{3}','foo 123 bar')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:30.280813Z",
     "start_time": "2020-12-09T02:06:30.274751Z"
    }
   },
   "outputs": [],
   "source": [
    "# doesn't contain\n",
    "re.search('\\d{3}','foo 1 23 bar')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## extract first coccurrence of regex in string\n",
    "\n",
    "> only the FIRST match is returned!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:30.816299Z",
     "start_time": "2020-12-09T02:06:30.806457Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<_sre.SRE_Match object; span=(4, 7), match='bar'>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# letter 'b' followed by two characters or numbers\n",
    "pattern = r'b\\w{2}'\n",
    "\n",
    "match = re.search(pattern,\"foo bar baz quux\")\n",
    "match"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:30.940101Z",
     "start_time": "2020-12-09T02:06:30.936436Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'bar'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "match.group(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract all occurrences of regex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:31.389281Z",
     "start_time": "2020-12-09T02:06:31.383845Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['bar', 'baz']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pattern = r'b\\w{2}'\n",
    "re.findall(pattern,'foo bar baz quux')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract all regex matches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:34:21.200750Z",
     "start_time": "2020-12-09T02:34:21.189201Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<_sre.SRE_Match object; span=(4, 7), match='bar'>,\n",
       " <_sre.SRE_Match object; span=(8, 11), match='baz'>]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pattern = r'b\\w{2}'\n",
    "matches = re.finditer(pattern,'foo bar baz quux')\n",
    "\n",
    "list(matches)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## split string by regex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:31.759124Z",
     "start_time": "2020-12-09T02:06:31.749937Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['foo', 'bar', 'bar', 'quux']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.split('[\\s,]+','foo,bar bar  quux')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## split string by word boundary\n",
    "\n",
    "> The code below will trigger a ValuerError"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:32.275587Z",
     "start_time": "2020-12-09T02:06:32.116927Z"
    }
   },
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "split() requires a non-empty pattern match.",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-10-79c07f48f05c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mre\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mr'\\b'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'foo,bar   bar-quux'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/jekyll-utils/jekyll-venv/lib/python3.6/re.py\u001b[0m in \u001b[0;36msplit\u001b[0;34m(pattern, string, maxsplit, flags)\u001b[0m\n\u001b[1;32m    210\u001b[0m     \u001b[0;32mand\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mremainder\u001b[0m \u001b[0mof\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mstring\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mreturned\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mfinal\u001b[0m \u001b[0melement\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    211\u001b[0m     of the list.\"\"\"\n\u001b[0;32m--> 212\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0m_compile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpattern\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstring\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmaxsplit\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    213\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    214\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfindall\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpattern\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstring\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: split() requires a non-empty pattern match."
     ]
    }
   ],
   "source": [
    "re.split(r'\\b','foo,bar   bar-quux')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:32.392371Z",
     "start_time": "2020-12-09T02:06:32.380527Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['foo', 'bar', 'bar', 'quux']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# only word characters\n",
    "re.findall(r'\\w+','foo,bar   bar-quux')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:32.958523Z",
     "start_time": "2020-12-09T02:06:32.955121Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['foo', ',', 'bar', 'bar', '-', 'quux']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.findall(r'\\w+|[^\\w\\s]+','foo,bar   bar-quux')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## non capturing groups"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## lookbehind"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:41.153857Z",
     "start_time": "2020-12-09T02:06:41.148579Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'foo bar fooBAR'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pattern = \"(?<=foo)bar\"\n",
    "replacement = \"BAR\"\n",
    "string = \"foo bar foobar\"\n",
    "\n",
    "re.sub(pattern,replacement,string)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## negative lookbehind"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:41.763928Z",
     "start_time": "2020-12-09T02:06:41.751811Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'foo BAR foobar'"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pattern = \"(?<!foo)bar\"\n",
    "replacement = \"BAR\"\n",
    "string = \"foo bar foobar\"\n",
    "\n",
    "re.sub(pattern,replacement,string)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## lookahead"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:42.112598Z",
     "start_time": "2020-12-09T02:06:42.102170Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'foo bar FOObar'"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pattern = \"foo(?=bar)\"\n",
    "replacement = \"FOO\"\n",
    "string = \"foo bar foobar\"\n",
    "\n",
    "re.sub(pattern,replacement,string)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## negative lookahead"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:42.499787Z",
     "start_time": "2020-12-09T02:06:42.490361Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'FOO bar foobar'"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pattern = \"foo(?!bar)\"\n",
    "replacement = \"FOO\"\n",
    "string = \"foo bar foobar\"\n",
    "\n",
    "re.sub(pattern,replacement,string)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## match vs search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:42.888965Z",
     "start_time": "2020-12-09T02:06:42.879822Z"
    }
   },
   "outputs": [],
   "source": [
    "re.match('abc','xx abc xx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:43.061157Z",
     "start_time": "2020-12-09T02:06:43.051464Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<_sre.SRE_Match object; span=(0, 3), match='abc'>"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.match('abc','abc')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:43.220217Z",
     "start_time": "2020-12-09T02:06:43.215292Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<_sre.SRE_Match object; span=(3, 6), match='abc'>"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "match = re.search('abc','xx abc xx')\n",
    "match"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:43.561744Z",
     "start_time": "2020-12-09T02:06:43.553705Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<_sre.SRE_Match object; span=(3, 6), match='abc'>"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "match = re.search('abc','xx abc xx abc')\n",
    "match"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## full match"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:43.967294Z",
     "start_time": "2020-12-09T02:06:43.961173Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<_sre.SRE_Match object; span=(0, 6), match='foo123'>"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.fullmatch('foo\\d+','foo123')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:44.169161Z",
     "start_time": "2020-12-09T02:06:44.165460Z"
    }
   },
   "outputs": [],
   "source": [
    "re.fullmatch('foo\\d+','foo123bar')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:44.302372Z",
     "start_time": "2020-12-09T02:06:44.286251Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<_sre.SRE_Match object; span=(0, 6), match='foo123'>"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.match('^foo\\d+$','foo123')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-12-09T02:06:44.773676Z",
     "start_time": "2020-12-09T02:06:44.767023Z"
    }
   },
   "outputs": [],
   "source": [
    "re.match('^foo\\d+$','foo123bar')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
